library(ggplot2)
library(ggpubr)
## Loading required package: magrittr
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
##
## mutate
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
LOAD DATA
movies <- get(load("movies.RData"))
head(movies)
summary(movies)
## title title_type genre
## Length:651 Documentary : 55 Drama :305
## Class :character Feature Film:591 Comedy : 87
## Mode :character TV Movie : 5 Action & Adventure: 65
## Mystery & Suspense: 59
## Documentary : 52
## Horror : 23
## (Other) : 60
## runtime mpaa_rating studio
## Min. : 39.0 G : 19 Paramount Pictures : 37
## 1st Qu.: 92.0 NC-17 : 2 Warner Bros. Pictures : 30
## Median :103.0 PG :118 Sony Pictures Home Entertainment: 27
## Mean :105.8 PG-13 :133 Universal Pictures : 23
## 3rd Qu.:115.8 R :329 Warner Home Video : 19
## Max. :267.0 Unrated: 50 (Other) :507
## NA's :1 NA's : 8
## thtr_rel_year thtr_rel_month thtr_rel_day dvd_rel_year
## Min. :1970 Min. : 1.00 Min. : 1.00 Min. :1991
## 1st Qu.:1990 1st Qu.: 4.00 1st Qu.: 7.00 1st Qu.:2001
## Median :2000 Median : 7.00 Median :15.00 Median :2004
## Mean :1998 Mean : 6.74 Mean :14.42 Mean :2004
## 3rd Qu.:2007 3rd Qu.:10.00 3rd Qu.:21.00 3rd Qu.:2008
## Max. :2014 Max. :12.00 Max. :31.00 Max. :2015
## NA's :8
## dvd_rel_month dvd_rel_day imdb_rating imdb_num_votes
## Min. : 1.000 Min. : 1.00 Min. :1.900 Min. : 180
## 1st Qu.: 3.000 1st Qu.: 7.00 1st Qu.:5.900 1st Qu.: 4546
## Median : 6.000 Median :15.00 Median :6.600 Median : 15116
## Mean : 6.333 Mean :15.01 Mean :6.493 Mean : 57533
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:7.300 3rd Qu.: 58300
## Max. :12.000 Max. :31.00 Max. :9.000 Max. :893008
## NA's :8 NA's :8
## critics_rating critics_score audience_rating audience_score
## Certified Fresh:135 Min. : 1.00 Spilled:275 Min. :11.00
## Fresh :209 1st Qu.: 33.00 Upright:376 1st Qu.:46.00
## Rotten :307 Median : 61.00 Median :65.00
## Mean : 57.69 Mean :62.36
## 3rd Qu.: 83.00 3rd Qu.:80.00
## Max. :100.00 Max. :97.00
##
## best_pic_nom best_pic_win best_actor_win best_actress_win best_dir_win
## no :629 no :644 no :558 no :579 no :608
## yes: 22 yes: 7 yes: 93 yes: 72 yes: 43
##
##
##
##
##
## top200_box director actor1 actor2
## no :636 Length:651 Length:651 Length:651
## yes: 15 Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## actor3 actor4 actor5
## Length:651 Length:651 Length:651
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## imdb_url rt_url
## Length:651 Length:651
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
TRANSFORM DATA
#Runtime:
movies[!complete.cases(movies['runtime']),]
movies$runtime[movies$title == "The End of America"] <- 74
#https://www.imdb.com/title/tt1294790/
#Director:
movies$director[movies$title == "Lorenzo's Oil"] <- "George Miller"
#https://es.wikipedia.org/wiki/Lorenzo%27s_Oil_(pel%C3%ADcula)
movies$director[movies$title == "The Ninth Gate"] <- "Roman Polanski"
#https://es.wikipedia.org/wiki/The_Ninth_Gate
#Studio:
movies$studio <- as.character(movies$studio)
movies$studio[movies$title == "Oliver & Company"] <-
"Walt Disney Pictures"
#https://es.wikipedia.org/wiki/Oliver_y_su_pandilla
movies$studio[movies$title == "Attack of the 50 Foot Woman"] <-
"Woolner Brothers Pictures Inc."
#https://www.imdb.com/title/tt0051380/
movies$studio[movies$title == "Inbred"] <- "Melanie Light"
#https://www.imdb.com/title/tt1723124/fullcredits
movies$studio[movies$title == "Caveman"] <- "United Artists"
#https://es.wikipedia.org/wiki/El_cavern%C3%ADcola
movies$studio[movies$title == "Dirty Sanchez: The Movie"] <-
"Vertigo Films"
#https://www.rottentomatoes.com/m/dirty_sanchez
movies$studio[movies$title == "The Man Who Sued God"] <-
"Australian Film Finance Corporation (AFFC), New South Wales Film & Television Office, Showtime Australia See more"
#https://www.imdb.com/title/tt0268437/
movies$studio[movies$title == "Inserts"] <-
"Film and General Productions"
#https://www.filmaffinity.com/es/film740308.html
movies$studio <- factor(movies$studio)
#DVD realease date
movies[!complete.cases(movies['dvd_rel_year']), ]
movies$dvd_rel_year[movies$title == "Charlie: The Life and Art of Charles Chaplin"] <-
2003
movies$dvd_rel_month[movies$title == "Charlie: The Life and Art of Charles Chaplin"] <-
11
movies$dvd_rel_day[movies$title == "Charlie: The Life and Art of Charles Chaplin"] <-
5
#https://www.imdb.com/title/tt0379730/releaseinfo
movies$dvd_rel_year[movies$title == "The Squeeze"] <- 2015
movies$dvd_rel_month[movies$title == "The Squeeze"] <- 6
movies$dvd_rel_day[movies$title == "The Squeeze"] <- 9
#https://medium.com/@releasebandyal/producer-michael-doven-announces-release-of-the-squeeze-on-dvd-75a982d0a047
movies$dvd_rel_year[movies$title == "Electric Dreams"] <- 1984
#https://en.wikipedia.org/wiki/Electric_Dreams_(film)
movies$dvd_rel_year[movies$title == "The Last Remake of Beau Geste"] <-
2010
movies$dvd_rel_month[movies$title == "The Last Remake of Beau Geste"] <-
1
movies$dvd_rel_day[movies$title == "The Last Remake of Beau Geste"] <-
11
#https://en.wikipedia.org/wiki/The_Last_Remake_of_Beau_Geste
#Actors:
movies[!complete.cases(movies['actor4']), ]
movies$actor4[movies$title == "Attack of the 50 Foot Woman"] <-
"Roy Gordon"
#https://www.imdb.com/title/tt0051380/fullcredits/?ref_=tt_ov_st_sm
movies[!complete.cases(movies['actor5']), ]
movies$actor5[movies$title == "Attack of the 50 Foot Woman"] <-
"George Douglas"
#https://www.imdb.com/title/tt0051380/fullcredits/?ref_=tt_ov_st_sm
movies$actor5[movies$title == "The Illusionist (L'illusionniste)"] <-
"Eleanor Tomlinson"
#https://en.wikipedia.org/wiki/The_Illusionist_(2006_film)
#Column manipulation
movies$thtr_rel_date <- as.Date(paste0(movies$thtr_rel_year, "-", movies$thtr_rel_month, "-", movies$thtr_rel_day))
movies$dvd_rel_date <- as.Date(paste0(movies$dvd_rel_year, "-", movies$dvd_rel_month, "-", movies$dvd_rel_day))
movies$thtr_rel_decade <- as.numeric(format(movies$thtr_rel_date,"%Y")) - (as.numeric(format(movies$thtr_rel_date,"%Y")) %% 10)
movies$thtr_rel_decade <- as.factor(movies$thtr_rel_decade)
movies$top200_box <- NULL
movies$dvd_rel_day <- NULL
movies$dvd_rel_month <- NULL
movies$dvd_rel_year <- NULL
#movies$thtr_rel_day <- NULL
#movies$thtr_rel_month <- NULL
#movies$thtr_rel_year <- NULL
movies$imdb_url <- NULL
movies$rt_url <- NULL
head(movies)
summary(movies)
## title title_type genre
## Length:651 Documentary : 55 Drama :305
## Class :character Feature Film:591 Comedy : 87
## Mode :character TV Movie : 5 Action & Adventure: 65
## Mystery & Suspense: 59
## Documentary : 52
## Horror : 23
## (Other) : 60
## runtime mpaa_rating studio
## Min. : 39.0 G : 19 Paramount Pictures : 37
## 1st Qu.: 92.0 NC-17 : 2 Warner Bros. Pictures : 30
## Median :103.0 PG :118 Sony Pictures Home Entertainment: 27
## Mean :105.8 PG-13 :133 Universal Pictures : 23
## 3rd Qu.:115.5 R :329 Warner Home Video : 19
## Max. :267.0 Unrated: 50 (Other) :514
## NA's : 1
## thtr_rel_year thtr_rel_month thtr_rel_day imdb_rating
## Min. :1970 Min. : 1.00 Min. : 1.00 Min. :1.900
## 1st Qu.:1990 1st Qu.: 4.00 1st Qu.: 7.00 1st Qu.:5.900
## Median :2000 Median : 7.00 Median :15.00 Median :6.600
## Mean :1998 Mean : 6.74 Mean :14.42 Mean :6.493
## 3rd Qu.:2007 3rd Qu.:10.00 3rd Qu.:21.00 3rd Qu.:7.300
## Max. :2014 Max. :12.00 Max. :31.00 Max. :9.000
##
## imdb_num_votes critics_rating critics_score audience_rating
## Min. : 180 Certified Fresh:135 Min. : 1.00 Spilled:275
## 1st Qu.: 4546 Fresh :209 1st Qu.: 33.00 Upright:376
## Median : 15116 Rotten :307 Median : 61.00
## Mean : 57533 Mean : 57.69
## 3rd Qu.: 58300 3rd Qu.: 83.00
## Max. :893008 Max. :100.00
##
## audience_score best_pic_nom best_pic_win best_actor_win best_actress_win
## Min. :11.00 no :629 no :644 no :558 no :579
## 1st Qu.:46.00 yes: 22 yes: 7 yes: 93 yes: 72
## Median :65.00
## Mean :62.36
## 3rd Qu.:80.00
## Max. :97.00
##
## best_dir_win director actor1 actor2
## no :608 Length:651 Length:651 Length:651
## yes: 43 Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## actor3 actor4 actor5
## Length:651 Length:651 Length:651
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## thtr_rel_date dvd_rel_date thtr_rel_decade
## Min. :1970-05-20 Min. :1991-03-28 1970: 51
## 1st Qu.:1990-12-07 1st Qu.:2001-05-15 1980:103
## Median :2000-09-15 Median :2004-03-02 1990:161
## Mean :1998-06-15 Mean :2004-12-06 2000:234
## 3rd Qu.:2007-05-17 3rd Qu.:2008-02-19 2010:102
## Max. :2014-12-25 Max. :2015-06-09
## NA's :5
We chose the “Films” dataset and we are going to answer the following questions that we thought were interesting.
g1 <- ggplot(data=movies, aes(x=audience_score)) + geom_histogram(bins=30, aes(fill = title_type), position = "identity", alpha = 0.7) + scale_color_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + scale_fill_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + theme(legend.position = "none")
g2 <- ggplot(data=movies, aes(x=critics_score)) + geom_histogram(bins=30, aes(fill = title_type), position = "identity", alpha = 0.7) + scale_color_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + scale_fill_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + theme(legend.position = "none")
g3 <- ggplot(data=movies, aes(x=imdb_rating)) + geom_histogram(bins=30, aes(fill = title_type), position = "identity", alpha = 0.7) + scale_color_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + scale_fill_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0"))
ggarrange(
g1,
g2,
g3,
ncol = 3,
nrow = 1,
widths = c(200,200,400),
labels = c("1", "2", "3")
)
g1 <- ggplot(data=movies, aes(x=audience_score)) + geom_density(aes(color = title_type), kernel = "gaussian")
g2 <- ggplot(data=movies, aes(x=critics_score)) + geom_density(aes(color = title_type), kernel = "gaussian")
g3 <- ggplot(data=movies, aes(x=imdb_rating)) + geom_density(aes(color = title_type), kernel = "gaussian")
ggarrange(
g1,
g2,
g3,
ncol = 3,
nrow = 1,
widths = c(200,200,400)
)
ggplot(data=movies, aes(x = reorder(genre, audience_score, median, order=TRUE), y = audience_score, group = genre, fill = genre)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw() + coord_flip()
ggplot(data=movies, aes(x = reorder(genre, critics_score, median, order=TRUE), y = critics_score, group = genre, fill = genre)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw() + coord_flip()
ggplot(data=movies, aes(x = reorder(genre, imdb_rating, median, order=TRUE), y = imdb_rating, group = genre, fill = genre)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw() + coord_flip()
score = list();
score$genre_score_agg = list(
"critics" = aggregate(critics_score ~ genre , movies, mean),
"audience" = aggregate(audience_score ~ genre, movies, mean),
"imdb" = aggregate(imdb_rating ~ genre, movies, mean)
)
score$genre_metrics_grid = expand.grid(year =sort(unique(movies$genre)),
metrics = c("Critic", "Audience", "IMDB"))
score$genre_metrics_grid$genre_score_agg =c(score$genre_score_agg$critics$critics_score,
score$genre_score_agg$audience$audience_score,
score$genre_score_agg$imdb$imdb_rating)
score$genre_metrics_grid$genre_score_agg_rescaled =
c(
rescale(score$genre_score_agg$critics$critics_score, from=(c(1,100))),
rescale(score$genre_score_agg$audience$audience_score, from=(c(1,100))),
rescale(score$genre_score_agg$imdb$imdb_rating, from=(c(1,10))))
ggplot(score$genre_metrics_grid)+
geom_tile(aes(x=metrics, y=year,fill=genre_score_agg_rescaled))+
scale_fill_distiller( limits=c(0,1), palette = "PuOr") +
labs(title = "Comparision of scores across genres (Heatmap)", x = "", y="", fill="")+
theme(plot.title = element_text(hjust = 0.5))
To compare the three different score types: Critic, Audience and IMDB to be comaral they must be rescaled as while Critic and Audience are of the range 1-100, IMDB is 1-10. After rescaling, to the range 0-1, a visual comparison of the different groups can be seen by arranging them on a heatmap.
The most striking feature is the lack of high scores for most genres, except for Documentary and Musical & Performing Arts. From the three groups Critics seem to be the most negative in their scoring, while Audience and IMDB to be more aligned. This clearly seen in Action & Adventure genre where Critics have a far lower score than the other two groups. This may be due to Critics having a different criteria on what constitutes a “good” movie but without details on the required metrics this is only speculation. Conversely though IMBD scores are most unaligned with the other groups for the two highest scoring genres, the reasoning for this can not be determined from the data available.
mean_score <-
data.frame(
"Score" = c(
mean(movies$imdb_rating)*10,
mean(movies$audience_score),
mean(movies$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_all <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(), axis.text.y = element_blank()) + ggtitle("All genres")
#Action and adventure
action_adventure <- subset(movies, genre == "Action & Adventure")
mean_score <-
data.frame(
"Score" = c(
mean(action_adventure$imdb_rating)*10,
mean(action_adventure$audience_score),
mean(action_adventure$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_action <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Action & Adventure")
#Animation
animation <- subset(movies, genre == "Animation")
mean_score <-
data.frame(
"Score" = c(
mean(animation$imdb_rating)*10,
mean(animation$audience_score),
mean(animation$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_animation <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Animation")
#Art House & International
art_house <- subset(movies, genre == "Art House & International")
mean_score <-
data.frame(
"Score" = c(
mean(art_house$imdb_rating)*10,
mean(art_house$audience_score),
mean(art_house$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_art_house <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.text.y = element_blank()) + ggtitle("Art House & International")
#Comedy
comedy <- subset(movies, genre == "Comedy")
mean_score <-
data.frame(
"Score" = c(
mean(comedy$imdb_rating)*10,
mean(comedy$audience_score),
mean(comedy$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_comedy <-ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Comedy")
#Documentary
documentary <- subset(movies, genre == "Documentary")
mean_score <-
data.frame(
"Score" = c(
mean(documentary$imdb_rating)*10,
mean(documentary$audience_score),
mean(documentary$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_documentary <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Documentary")
#Drama
drama <- subset(movies, genre == "Drama")
mean_score <-
data.frame(
"Score" = c(
mean(drama$imdb_rating)*10,
mean(drama$audience_score),
mean(drama$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_drama <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.text.y = element_blank()) + ggtitle("Drama")
#Horror
horror <- subset(movies, genre == "Horror")
mean_score <-
data.frame(
"Score" = c(
mean(horror$imdb_rating)*10,
mean(horror$audience_score),
mean(horror$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_horror <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Horror")
#Musical & Performing Arts
musical <- subset(movies, genre == "Musical & Performing Arts")
mean_score <-
data.frame(
"Score" = c(
mean(musical$imdb_rating)*10,
mean(musical$audience_score),
mean(musical$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_musical <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.text.y = element_blank()) + ggtitle("Musical & Performing Arts")
#Mystery & Suspense
mistery <- subset(movies, genre == "Mystery & Suspense")
mean_score <-
data.frame(
"Score" = c(
mean(mistery$imdb_rating)*10,
mean(mistery$audience_score),
mean(mistery$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_mistery <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Mystery & Suspense")
#Other
other <- subset(movies, genre == "Other")
mean_score <-
data.frame(
"Score" = c(
mean(other$imdb_rating)*10,
mean(other$audience_score),
mean(other$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_other <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Other")
#Science Fiction & Fantasy
science <- subset(movies, genre == "Science Fiction & Fantasy")
mean_score <-
data.frame(
"Score" = c(
mean(science$imdb_rating)*10,
mean(science$audience_score),
mean(science$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_science <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Science Fiction & Fantasy")
#Combine the plots...
plot_combined_genre <-ggarrange(
plot_all,
plot_action,
plot_animation,
plot_art_house,
plot_comedy,
plot_documentary,
plot_drama,
plot_horror,
plot_mistery,
plot_musical,
plot_other,
plot_science,
ncol = 3,
nrow = 4,common.legend = TRUE, legend="bottom")+
labs(title = "Comparision of scores across genres (Lollipop chart)", x = "", y="", fill="")+
theme(plot.title = element_text(hjust = 0.5))
plot_combined_genre
Taking a look at the different lollipop charts, we can see different things:
Taking this into account, we can say that the pattern observed in previously is maintained in the individual genres. Critics give the lowest ratings for almost all genres and IMDB and Audience alternate in giving the highest score. There is a difference in the raitings of the different sources according to the genre and this difference follows a pattern, which is more accentuated in some genres than in others.
mean_score <-
data.frame(
"Score" = c(
mean(movies$imdb_rating)*10,
mean(movies$audience_score),
mean(movies$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_mpaa_all <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(), axis.text.y = element_blank()) + ggtitle("All MPAA ratings")
#G
g <- subset(movies, mpaa_rating == "G")
mean_score <-
data.frame(
"Score" = c(
mean(g$imdb_rating)*10,
mean(g$audience_score),
mean(g$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_g <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("G")
#NC-17
nc_17 <- subset(movies, mpaa_rating == "NC-17")
mean_score <-
data.frame(
"Score" = c(
mean(nc_17$imdb_rating)*10,
mean(nc_17$audience_score),
mean(nc_17$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_nc_17 <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("NC-17")
#PG
pg <- subset(movies, mpaa_rating == "PG")
mean_score <-
data.frame(
"Score" = c(
mean(pg$imdb_rating)*10,
mean(pg$audience_score),
mean(pg$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_pg <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(),axis.text.y = element_blank()) + ggtitle("PG")
#PG-13
pg_13 <- subset(movies, mpaa_rating == "PG-13")
mean_score <-
data.frame(
"Score" = c(
mean(pg_13$imdb_rating)*10,
mean(pg_13$audience_score),
mean(pg_13$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_pg_13 <-ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("PG-13")
#R
r <- subset(movies, mpaa_rating == "R")
mean_score <-
data.frame(
"Score" = c(
mean(r$imdb_rating)*10,
mean(r$audience_score),
mean(r$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_r <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("R")
#Unrated
unrated <- subset(movies, mpaa_rating == "Unrated")
mean_score <-
data.frame(
"Score" = c(
mean(unrated$imdb_rating)*10,
mean(unrated$audience_score),
mean(unrated$critics_score)
),
"Source" = c("IMDB", "Audience", "Critics")
)
plot_unrated <- ggplot(mean_score, aes(Score, Source)) +
geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
geom_point(aes( colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.text.y = element_blank()) + ggtitle("Unrated")
#Combine the plots...
plot_combined_mpaa <-ggarrange(
plot_mpaa_all,
plot_g,
plot_nc_17,
plot_pg,
plot_pg_13,
plot_r,
plot_unrated,
ncol = 3,
nrow = 3,common.legend = TRUE, legend="bottom")+
labs(title = "Comparision of scores across MPAA ratings (Lollipop chart)", x = "", y="", fill="")+
theme(plot.title = element_text(hjust = 0.5))
plot_combined_mpaa
If we now make a similar comparisson for the MPAA ratings, we can see the following: * Audience gives a higher score than Critics in 4/6 MPAA ratings (66.6%). The two MPAA ratings where Critics give a higher score are NC-17 and Unrated. * IMDB gives a higher score than Critics in 4/6 MPAA ratings (66.6%). Again, the two exceptions are NC-17 and Unrated. * IMDB tends to give a higher score than Audience. This happens, again, in 4/6 MPAA ratings (66.6%). The MPAA ratings where this does not happen are G and Unrated. * Exceptuating G and PG, the rest of the MPAA ratings receive clearly different, although not highly, scores from the three different sources.
From these findings we can see that overall IMDB and the Audience tend to give more positive reviews than the Critics. However, the scores given by the three sources are not that different one from the other in the majority of the MPAA ratings (the only exceptions being the extreme Critics score for NC-17 and PG-13). Therefore, there is not enough evidence to say that there is a significant difference in the score according to the MPAA rating.
ggplot(data=movies, aes(x = best_pic_nom, y = audience_score, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(data=movies, aes(x = best_actor_win, y = audience_score, group = best_actor_win, fill = best_actor_win)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(data=movies, aes(x = best_actress_win, y = audience_score, group = best_actress_win, fill = best_actress_win)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(data=movies, aes(x = best_dir_win, y = audience_score, group = best_dir_win, fill = best_dir_win)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(data=movies, aes(x = best_pic_nom, y = audience_score, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(data=movies, aes(x = best_pic_nom, y = critics_score, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(data=movies, aes(x = best_pic_nom, y = imdb_rating, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()
ggplot(aes(x=best_pic_nom), data=movies[movies$best_pic_win=="yes",]) + geom_bar()
movies[movies$best_pic_nom=="no" & movies$best_pic_win=="yes",]
movies[movies$best_pic_nom=="no" & movies$best_pic_win=="yes",]$best_pic_nom <- "yes"
ggplot(aes(x=best_actor_win), data=movies[movies$best_pic_nom=="yes",]) + geom_bar()
ggplot(aes(x=best_actress_win), data=movies[movies$best_pic_nom=="yes",]) + geom_bar()
ggplot(aes(x=best_dir_win), data=movies[movies$best_pic_nom=="yes",]) + geom_bar()
best_pic_movies <- subset(movies, c(best_pic_win == "yes"))
#Add a column showing if they have won another award
best_pic_movies$other_win <- NA
if (best_pic_movies$best_actor_win == "yes" ||
best_pic_movies$best_actress_win == "yes" ||
best_pic_movies$best_dir_win == "yes") {
best_pic_movies$other_win <- "yes"
} else{
best_pic_movies$other_win <- "no"
}
#Plots...
#Best movies that have won another award
best_movie_another_award <- plyr::count(best_pic_movies$other_win)
best_movie_another_award <- best_movie_another_award %>%
arrange((freq)) %>%
mutate(lab.ypos = cumsum(freq) - 0.5 * freq)
pie_another_award <-
ggplot(best_movie_another_award, aes(x = "", y = freq, fill = x)) +
geom_bar(width = 1,
stat = "identity",
color = "white") +
coord_polar("y", start = 0) +
theme_void() +
geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
position = position_stack(vjust = 0.5),
color = "white") +
labs(fill = "Another award") +
ggtitle("Best movies w/ awarded member") +
theme(plot.title = element_text(size = 12, hjust = 0.5)) +
scale_fill_manual(values = "#00BFC4")
## Warning: Ignoring unknown aesthetics: digits
#Best movies that have won best actor award
best_movie_best_actor <- plyr::count(best_pic_movies$best_actor_win)
best_movie_best_actor <- best_movie_best_actor %>%
arrange((freq)) %>%
mutate(lab.ypos = cumsum(freq) - 0.5 * freq)
pie_best_actor <-
ggplot(best_movie_best_actor, aes(x = "", y = freq, fill = x)) +
geom_bar(width = 1,
stat = "identity",
color = "white") +
coord_polar("y", start = 0) +
theme_void() +
geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
position = position_stack(vjust = 0.5),
color = "white") +
labs(fill = "Best actor") +
ggtitle("Best movies w/ best actor") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
## Warning: Ignoring unknown aesthetics: digits
#Best movies that have won best actress award
best_movie_best_actress <- plyr::count(best_pic_movies$best_actress_win)
best_movie_best_actress <- best_movie_best_actress %>%
arrange((freq)) %>%
mutate(lab.ypos = cumsum(freq) - 1 * freq)
pie_best_actress <-
ggplot(best_movie_best_actress, aes(x = "", y = freq, fill = x)) +
geom_bar(width = 1,
stat = "identity",
color = "white") +
coord_polar("y", start = 0) +
theme_void() +
geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
position = position_stack(vjust = 0.5),
color = "white") +
labs(fill = "Best actress") +
ggtitle("Best movies w/ best actress") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
## Warning: Ignoring unknown aesthetics: digits
#Best movies that have won best director award
best_movie_best_director <- plyr::count(best_pic_movies$best_dir_win)
best_movie_best_director <- best_movie_best_director %>%
arrange((freq)) %>%
mutate(lab.ypos = cumsum(freq) - 0.9 * freq)
pie_best_director <-
ggplot(best_movie_best_director, aes(x = "", y = freq, fill = x)) +
geom_bar(width = 1,
stat = "identity",
color = "white") +
coord_polar("y", start = 0) +
theme_void() +
geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
position = position_stack(vjust = 0.5),
color = "white") +
labs(fill = "Best director") +
ggtitle("Best movies w/ best director") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
## Warning: Ignoring unknown aesthetics: digits
#Combine plots!
plot_combined_awarded <-
ggarrange(
pie_another_award,
pie_best_actor,
pie_best_actress,
pie_best_director,
ncol = 2,
nrow = 2,
widths = c(400,400,400,400)
)+
labs(title = "Proportion of awards in the best movies (Pie chart)", x = "", y="", fill="")+
theme(plot.title = element_text(hjust = 0.5))
plot_combined_awarded
Looking at the pie charts, we can see that the movies in the dataset that have received the award for best film always have a cast/crew member with another award. It must be taken into account that the actors, actresses and directors could have received their award at any point in their career, not necessarily for the awarded film in which they participate. That is, the pie charts are not meant to show that if a cast/crew member receives an award for one film, that film is more likely to be awarded itself. However, the plots can serve to study the relationship between having high quality proffesionals working in a movie and that movie receiving an award.
Considering this, we can see that most awarded movies in this dataset have a director that has or will receive an award. The presence of awarded directors in 6/7 movies highly surpasses the proportion of awarded movies with awarded actors and actresses. This hints that the quality of the director is mor influent than that of the actors and actresses in determining if the movie is awarded for best film. This deduction makes sense, since actors are only responsible for playing their roles whereas the director’s decisions involve all the elements of a film.
# -are oscar-awarded films more liked?
#Audience score vs Best_pic_win
plot_audience_vs_best_pic <-
ggplot(movies,
aes(x = best_pic_win, y = audience_score, fill = best_pic_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best picture award", y = "Score")
#Critics score vs Best_pic_win
plot_critics_vs_best_pic <-
ggplot(movies,
aes(x = best_pic_win, y = critics_score, fill = best_pic_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best picture award", y = "Score")
#IMDB score vs Best_pic_win
plot_imdb_vs_best_pic <-
ggplot(movies, aes(x = best_pic_win, y = imdb_rating, fill = best_pic_win)) +
geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best picture award", y = "Score")
#Combine the plots...
plot_combined_best_pic <-
ggarrange(
plot_audience_vs_best_pic,
plot_critics_vs_best_pic,
plot_imdb_vs_best_pic,
ncol = 3,
nrow = 1
)
plot_combined_best_pic
#Audience score vs Best_dir_win
plot_audience_vs_best_dir <-
ggplot(movies,
aes(x = best_dir_win, y = audience_score, fill = best_dir_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best director award", y = "Score")
#Critics score vs Best_dir_win
plot_critics_vs_best_dir <-
ggplot(movies,
aes(x = best_dir_win, y = critics_score, fill = best_dir_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best director award", y = "Score")
#IMDB score vs Best_dir_win
plot_imdb_vs_best_dir <-
ggplot(movies, aes(x = best_dir_win, y = imdb_rating, fill = best_dir_win)) +
geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best director award", y = "Score")
#Combine the plots...
plot_combined_best_dir <-
ggarrange(
plot_audience_vs_best_dir,
plot_critics_vs_best_dir,
plot_imdb_vs_best_dir,
ncol = 3,
nrow = 1
)
plot_combined_best_dir
#Audience score vs Best_actor_win
plot_audience_vs_best_actor <-
ggplot(movies,
aes(x = best_actor_win, y = audience_score, fill = best_actor_win)) +
geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best actor award", y = "Score")
#Critics score vs Best_dir_win
plot_critics_vs_best_actor <-
ggplot(movies,
aes(x = best_actor_win, y = critics_score, fill = best_actor_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best actor award", y = "Score")
#IMDB score vs Best_dir_win
plot_imdb_vs_best_actor <-
ggplot(movies,
aes(x = best_actor_win, y = imdb_rating, fill = best_actor_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best actor award", y = "Score")
#Combine the plots...
plot_combined_best_actor <-
ggarrange(
plot_audience_vs_best_actor,
plot_critics_vs_best_actor,
plot_imdb_vs_best_actor,
ncol = 3,
nrow = 1
)
plot_combined_best_actor
#Audience score vs Best_actress_win
plot_audience_vs_best_actress <-
ggplot(movies,
aes(x = best_actress_win, y = audience_score, fill = best_actress_win)) +
geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best actress award", y = "Score")
#Critics score vs Best_dir_win
plot_critics_vs_best_actress <-
ggplot(movies,
aes(x = best_actress_win, y = critics_score, fill = best_actress_win)) +
geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best actress award", y = "Score")
#IMDB score vs Best_dir_win
plot_imdb_vs_best_actress <-
ggplot(movies,
aes(x = best_actress_win, y = imdb_rating, fill = best_actress_win)) +
geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best actress award", y = "Score")
#Combine the plots...
plot_combined_best_actress <-
ggarrange(
plot_audience_vs_best_actress,
plot_critics_vs_best_actress,
plot_imdb_vs_best_actress,
ncol = 3,
nrow = 1
)
plot_combined_best_actress
We start to count the movies per months to see if there is a pattern in all this time in the release months.
ggplot(movies, aes(fill = thtr_rel_decade)) + geom_bar(aes(x = thtr_rel_month), width = 0.5) + theme_minimal() + xlab("Month") + ylab("# of Films") + ggtitle("Number of Films released per month each decade ") + scale_fill_discrete(name = "Decade", labels=c("70's","80's","90's","00's","10's")) + scale_x_discrete(breaks=1:12,labels=c("Jan", "Feb", "Mar", "Apr","May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"))
We can see that overall there is a “pattern” of releases in the months of January, June, October and December, so let’s take a better look at this data by decade. We breakdown the previous graphs grouping the data by decade.
ggplot(movies, aes(fill = thtr_rel_decade)) + geom_bar(aes(x = thtr_rel_month), width = 0.5) + facet_wrap(thtr_rel_decade ~ .) + theme_minimal() + xlab("Month") + ylab("# of Films") + ggtitle("Number of Films released per month each decade w/ breakdown") + scale_fill_discrete(name = "Decade", labels=c("70's","80's","90's","00's","10's")) + scale_x_discrete(breaks=1:12,labels=c("Jan", "Feb", "Mar", "Apr","May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"))
> Now we can see clearly that since the 70’s there has been a trend of two moments in the years to release movies in theaters: in summer (June) and in the winter Holidays (December and January). However, in this last decade, we only see a spike in December.
ggplot(movies, aes(fill = thtr_rel_decade)) + geom_bar(aes(x = genre)) + facet_wrap(thtr_rel_decade ~ .) + theme_minimal() + xlab("Genre") + ylab("# of Films") + ggtitle("Genre of Films released each decade w/ breakdown") + scale_fill_discrete(name = "Decade", labels=c("70's","80's","90's","00's","10's")) + coord_flip()
We can see an increment over the years in the Documentary Genre, as well as in Mystery and Suspense.
Let’s see what is the amount of votes over the years that we have for all the films.
ggplot(movies, aes(fill = thtr_rel_date)) + geom_smooth(aes(x = thtr_rel_date, y = imdb_num_votes),method = 'loess', formula = 'y ~ x') + geom_point(aes(x = thtr_rel_date, y = imdb_num_votes)) + theme_minimal() + xlab("Date") + ylab("# of Votes") + ylim(min(movies$imdb_num_votes),quantile(movies$imdb_num_votes,.90)) + ggtitle("# of votes rating movies over the years") # + scale_fill_discrete(name = "Decade", labels=c("70's","80's","90's","00's","10's"))
## Warning: Removed 65 rows containing non-finite values (stat_smooth).
## Warning: Removed 65 rows containing missing values (geom_point).
Okay so it seems like the number of votes does increase over the years, but does it afect the overall score?
ggplot(movies, aes(fill = thtr_rel_date)) + geom_smooth(aes(x = thtr_rel_date, y = imdb_rating), method = 'loess', formula = 'y ~ x'
) + theme_minimal() + xlab("Date") + ylab("Rating") + ggtitle("Rating of movies over the years") # + scale_fill_discrete(name = "Decade", labels=c("70's","80's","90's","00's","10's"))
We can clearly see a decrease in overall score of this movies, but we can only asume that the newer the film, the more votes it gets and the worse it performs, but it could be because the quality in filmaking has woesened.
ggplot(data=movies, aes(x=thtr_rel_date, y=audience_score)) + geom_point(aes(col=best_pic_nom), size=3)
commonTheme = list(labs(color="Density",fill="Density", x="Date", y="Overall score"))
ggplot(data=movies,aes(thtr_rel_date, audience_score)) +
stat_density2d(aes(fill=..level..,alpha=..level..),geom='polygon',colour='black') +
scale_fill_continuous(low="green",high="red") +
guides(alpha="none") +
commonTheme
commonTheme = list(labs(color="Density",fill="Density", x="Runtime", y="Overall score"))
ggplot(data=movies,aes(runtime, audience_score)) +
stat_density2d(aes(fill=..level..,alpha=..level..),geom='polygon',colour='black') +
scale_fill_continuous(low="green",high="red") +
guides(alpha="none") +
commonTheme
best_dirs = subset(movies, movies$best_dir_win == "yes") %>%
group_by(director) %>%
filter(n() > 2)
win_years = data.frame(
year = c(2010, 2007,1987, 1990, 1978),
director=c("Kathryn Bigelow", "Martin Scorsese","Oliver Stone","Oliver Stone","Woody Allen")
)
win_year_intersects =
geom_vline(data = win_years, aes(xintercept=year, color = director), show.legend = FALSE)
p1 = ggplot(best_dirs, aes(y=critics_score, x=thtr_rel_year, color=director))+
geom_point()+
geom_line(show.legend = FALSE) +
labs(title = "Oscar winning director movie scores", y = "Critics Score", x = "", color="Movie Released")+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
win_year_intersects
p2 = ggplot(best_dirs, aes(y=audience_score, x=thtr_rel_year, color=director))+
geom_point()+
geom_line() +
labs(y = "Audience Score", x = "")+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
win_year_intersects
p3= ggplot(best_dirs, aes(y=imdb_rating, x=thtr_rel_year, color=director))+
geom_point()+
geom_line() +
labs(y = "IMDB Rating", x = "Film Release Year")+
win_year_intersects
ggarrange(p1, p2, p3, nrow = 3, common.legend = TRUE, legend="bottom")